{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 利用 Tf-idf 提取特征"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 加载库"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from gensim import corpora, models, similarities\n",
    "import numpy as np\n",
    "from pprint import pprint\n",
    "import logging\n",
    "from gensim import corpora\n",
    "from collections import defaultdict\n",
    "import pandas as pd\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 变量设定"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "outputPath = \"output/\"\n",
    "isDemo = False\n",
    "if isDemo:\n",
    "    commonPath = \"data/\"\n",
    "    trainPath = commonPath+\"train_set_demo.csv\"\n",
    "    testPath = commonPath+\"test_set_demo.csv\"\n",
    "    trainNewPath = outputPath+\"train_set_demo_new.csv\"\n",
    "    testNewPath = outputPath+\"test_set_demo_new.csv\"\n",
    "else:\n",
    "    commonPath = \"../data/\"\n",
    "    trainPath = commonPath+\"train_set.csv\"\n",
    "    testPath = commonPath+\"test_a.csv\"\n",
    "    trainNewPath = outputPath+\"train_set_new.csv\"\n",
    "    testNewPath = outputPath+\"test_a_new.csv\"\n",
    "\n",
    "dict_dataframe_path = outputPath+\"dict_dataframe.csv\"\n",
    "fileName = 'all'\n",
    "freq = 1"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 函数定义"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 功能 csv 数据 加载\n",
    "def loadCSVData(fileName):\n",
    "    df = pd.read_csv(fileName,encoding=\"utf-8\",sep=\"\\t\")\n",
    "    return df\n",
    "\n",
    "# 数据预处理\n",
    "def data_process(documents,freq=5):\n",
    "    texts = [[word for word in document.split(\" \")] for document in documents]\n",
    "    # 去掉只出现低于 freq 次的单词\n",
    "    frequency = defaultdict(int)\n",
    "    for text in texts:\n",
    "        for token in text:\n",
    "            frequency[token] += 1\n",
    "            \n",
    "    texts = [[token for token in text if frequency[token] > freq]\n",
    "             for text in texts]\n",
    "    return texts\n",
    "\n",
    "# 功能：加载 词典\n",
    "def get_dict(texts,outputPath,fileName):\n",
    "    '''\n",
    "        功能：加载 词典\n",
    "        input:\n",
    "            texts  String List  分词后的 文本 \n",
    "        ouput:\n",
    "            dictionary : Dict   词典\n",
    "    '''\n",
    "    dictionary = corpora.Dictionary(texts)   # 生成词典 \n",
    "    # 将文档存入字典，字典有很多功能，比如\n",
    "    # diction.token2id 存放的是单词-id key-value对\n",
    "    # diction.dfs 存放的是单词的出现频率\n",
    "    dictionary.save(outputPath+fileName+'.dict')  # store the dictionary, for future reference\n",
    "    corpus = [dictionary.doc2bow(text) for text in texts]\n",
    "    corpus = [dictionary.doc2bow(text) for text in texts]\n",
    "    corpora.MmCorpus.serialize(outputPath+fileName+'.mm', corpus)  # store to disk, for later use\n",
    "    return dictionary\n",
    "\n",
    "# 打印字典\n",
    "def PrintDictionary(dictionary,dict_dataframe_path):\n",
    "    token2id = dictionary.token2id\n",
    "    dfs = dictionary.dfs\n",
    "    token_info = {}\n",
    "    for word in token2id:\n",
    "        token_info[word] = dict(\n",
    "            word = word,\n",
    "            id = token2id[word],\n",
    "            freq = dfs[token2id[word]]\n",
    "        )\n",
    "    token_items = token_info.values()\n",
    "    token_items = sorted(token_items, key = lambda x:x['freq'])\n",
    "    print('The info of dictionary: ')\n",
    "    print(\"len(token_items):{0}\".format(len(token_items)))\n",
    "    pprint(token_items)\n",
    "    pd.DataFrame(token_items).to_csv(dict_dataframe_path,encoding=\"utf-8\")\n",
    "#     print(df)\n",
    "    print('--------------------------')\n",
    "\n",
    "# 数据 特征提取\n",
    "def text_feature_ex(text,feature_names,minFreq,sep=\" \"):\n",
    "    new_text = []\n",
    "    for word in text.split(sep):\n",
    "        if word in feature_names:\n",
    "            new_text.append(word)\n",
    "    if len(new_text)>=minFreq:\n",
    "        return sep.join(new_text)\n",
    "    else:\n",
    "        return text"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 操作"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "trainDf = loadCSVData(trainPath)\n",
    "testDf = loadCSVData(testPath)\n",
    "alldocument = list(trainDf['text'])+list(testDf['text'])\n",
    "texts = data_process(alldocument,freq=1)\n",
    "len(texts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [],
   "source": [
    "dictionary = get_dict(texts,outputPath,fileName)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Used files generated from first tutorial\n",
      "The info of dictionary: \n",
      "len(token_items):3561\n",
      "[{'freq': 1, 'id': 640, 'word': '5605'},\n",
      " {'freq': 1, 'id': 655, 'word': '607'},\n",
      " {'freq': 1, 'id': 1474, 'word': '3065'},\n",
      " {'freq': 1, 'id': 1759, 'word': '1890'},\n",
      " {'freq': 1, 'id': 1826, 'word': '7336'},\n",
      " {'freq': 1, 'id': 1836, 'word': '2105'},\n",
      " {'freq': 1, 'id': 2002, 'word': '530'},\n",
      " {'freq': 1, 'id': 2129, 'word': '3192'},\n",
      " {'freq': 1, 'id': 2180, 'word': '637'},\n",
      " {'freq': 1, 'id': 2277, 'word': '7542'},\n",
      " {'freq': 1, 'id': 2384, 'word': '5711'},\n",
      " {'freq': 1, 'id': 2385, 'word': '6369'},\n",
      " {'freq': 1, 'id': 2445, 'word': '2067'},\n",
      " {'freq': 1, 'id': 2501, 'word': '3190'},\n",
      " {'freq': 1, 'id': 2502, 'word': '3243'},\n",
      " {'freq': 1, 'id': 2509, 'word': '6664'},\n",
      " {'freq': 1, 'id': 2519, 'word': '2521'},\n",
      " {'freq': 1, 'id': 2567, 'word': '3219'},\n",
      " {'freq': 1, 'id': 2657, 'word': '7124'},\n",
      " {'freq': 1, 'id': 2667, 'word': '5959'},\n",
      " {'freq': 1, 'id': 2682, 'word': '181'},\n",
      " {'freq': 1, 'id': 2690, 'word': '6067'},\n",
      " {'freq': 1, 'id': 2700, 'word': '1072'},\n",
      " {'freq': 1, 'id': 2715, 'word': '4539'},\n",
      " {'freq': 1, 'id': 2716, 'word': '4870'},\n",
      " {'freq': 1, 'id': 2718, 'word': '612'},\n",
      " {'freq': 1, 'id': 2719, 'word': '1572'},\n",
      " {'freq': 1, 'id': 2723, 'word': '5496'},\n",
      " {'freq': 1, 'id': 2730, 'word': '5952'},\n",
      " {'freq': 1, 'id': 2758, 'word': '7353'},\n",
      " {'freq': 1, 'id': 2784, 'word': '5186'},\n",
      " {'freq': 1, 'id': 2833, 'word': '1128'},\n",
      " {'freq': 1, 'id': 2842, 'word': '5431'},\n",
      " {'freq': 1, 'id': 2869, 'word': '4846'},\n",
      " {'freq': 1, 'id': 2906, 'word': '5732'},\n",
      " {'freq': 1, 'id': 2938, 'word': '4806'},\n",
      " {'freq': 1, 'id': 2941, 'word': '3323'},\n",
      " {'freq': 1, 'id': 2942, 'word': '3542'},\n",
      " {'freq': 1, 'id': 2944, 'word': '4182'},\n",
      " {'freq': 1, 'id': 2945, 'word': '4423'},\n",
      " {'freq': 1, 'id': 2976, 'word': '1173'},\n",
      " {'freq': 1, 'id': 2981, 'word': '1700'},\n",
      " {'freq': 1, 'id': 2987, 'word': '1178'},\n",
      " {'freq': 1, 'id': 2995, 'word': '6194'},\n",
      " {'freq': 1, 'id': 3009, 'word': '1044'},\n",
      " {'freq': 1, 'id': 3050, 'word': '4763'},\n",
      " {'freq': 1, 'id': 3086, 'word': '3693'},\n",
      " {'freq': 1, 'id': 3092, 'word': '700'},\n",
      " {'freq': 1, 'id': 3093, 'word': '959'},\n",
      " {'freq': 1, 'id': 3104, 'word': '2904'},\n",
      " {'freq': 1, 'id': 3118, 'word': '2078'},\n",
      " {'freq': 1, 'id': 3142, 'word': '123'},\n",
      " {'freq': 1, 'id': 3145, 'word': '3677'},\n",
      " {'freq': 1, 'id': 3152, 'word': '1573'},\n",
      " {'freq': 1, 'id': 3155, 'word': '4254'},\n",
      " {'freq': 1, 'id': 3158, 'word': '5990'},\n",
      " {'freq': 1, 'id': 3170, 'word': '7216'},\n",
      " {'freq': 1, 'id': 3201, 'word': '539'},\n",
      " {'freq': 1, 'id': 3204, 'word': '7093'},\n",
      " {'freq': 1, 'id': 3206, 'word': '1802'},\n",
      " {'freq': 1, 'id': 3212, 'word': '864'},\n",
      " {'freq': 1, 'id': 3215, 'word': '2132'},\n",
      " {'freq': 1, 'id': 3216, 'word': '6475'},\n",
      " {'freq': 1, 'id': 3221, 'word': '3814'},\n",
      " {'freq': 1, 'id': 3234, 'word': '793'},\n",
      " {'freq': 1, 'id': 3240, 'word': '4774'},\n",
      " {'freq': 1, 'id': 3241, 'word': '6805'},\n",
      " {'freq': 1, 'id': 3260, 'word': '1762'},\n",
      " {'freq': 1, 'id': 3261, 'word': '2894'},\n",
      " {'freq': 1, 'id': 3262, 'word': '4093'},\n",
      " {'freq': 1, 'id': 3263, 'word': '6240'},\n",
      " {'freq': 1, 'id': 3267, 'word': '370'},\n",
      " {'freq': 1, 'id': 3272, 'word': '476'},\n",
      " {'freq': 1, 'id': 3273, 'word': '102'},\n",
      " {'freq': 1, 'id': 3281, 'word': '5353'},\n",
      " {'freq': 1, 'id': 3282, 'word': '7248'},\n",
      " {'freq': 1, 'id': 3284, 'word': '295'},\n",
      " {'freq': 1, 'id': 3295, 'word': '731'},\n",
      " {'freq': 1, 'id': 3300, 'word': '3535'},\n",
      " {'freq': 1, 'id': 3311, 'word': '5738'},\n",
      " {'freq': 1, 'id': 3313, 'word': '2026'},\n",
      " {'freq': 1, 'id': 3327, 'word': '1882'},\n",
      " {'freq': 1, 'id': 3328, 'word': '4549'},\n",
      " {'freq': 1, 'id': 3329, 'word': '6'},\n",
      " {'freq': 1, 'id': 3330, 'word': '2239'},\n",
      " {'freq': 1, 'id': 3331, 'word': '2250'},\n",
      " {'freq': 1, 'id': 3334, 'word': '7005'},\n",
      " {'freq': 1, 'id': 3339, 'word': '4295'},\n",
      " {'freq': 1, 'id': 3343, 'word': '2110'},\n",
      " {'freq': 1, 'id': 3349, 'word': '4229'},\n",
      " {'freq': 1, 'id': 3351, 'word': '5672'},\n",
      " {'freq': 1, 'id': 3354, 'word': '6460'},\n",
      " {'freq': 1, 'id': 3368, 'word': '5366'},\n",
      " {'freq': 1, 'id': 3370, 'word': '1056'},\n",
      " {'freq': 1, 'id': 3374, 'word': '1455'},\n",
      " {'freq': 1, 'id': 3375, 'word': '1990'},\n",
      " {'freq': 1, 'id': 3381, 'word': '5039'},\n",
      " {'freq': 1, 'id': 3383, 'word': '2973'},\n",
      " {'freq': 1, 'id': 3388, 'word': '1851'},\n",
      " {'freq': 1, 'id': 3389, 'word': '4279'},\n",
      " {'freq': 1, 'id': 3391, 'word': '2527'},\n",
      " {'freq': 1, 'id': 3396, 'word': '2943'},\n",
      " {'freq': 1, 'id': 3402, 'word': '1444'},\n",
      " {'freq': 1, 'id': 3403, 'word': '3115'},\n",
      " {'freq': 1, 'id': 3409, 'word': '5376'},\n",
      " {'freq': 1, 'id': 3418, 'word': '1175'},\n",
      " {'freq': 1, 'id': 3421, 'word': '4875'},\n",
      " {'freq': 1, 'id': 3429, 'word': '4610'},\n",
      " {'freq': 1, 'id': 3431, 'word': '682'},\n",
      " {'freq': 1, 'id': 3432, 'word': '5372'},\n",
      " {'freq': 1, 'id': 3434, 'word': '2155'},\n",
      " {'freq': 1, 'id': 3435, 'word': '2679'},\n",
      " {'freq': 1, 'id': 3439, 'word': '4300'},\n",
      " {'freq': 1, 'id': 3442, 'word': '1984'},\n",
      " {'freq': 1, 'id': 3446, 'word': '749'},\n",
      " {'freq': 1, 'id': 3448, 'word': '3936'},\n",
      " {'freq': 1, 'id': 3450, 'word': '4667'},\n",
      " {'freq': 1, 'id': 3459, 'word': '6782'},\n",
      " {'freq': 1, 'id': 3460, 'word': '2927'},\n",
      " {'freq': 1, 'id': 3464, 'word': '106'},\n",
      " {'freq': 1, 'id': 3465, 'word': '1475'},\n",
      " {'freq': 1, 'id': 3466, 'word': '360'},\n",
      " {'freq': 1, 'id': 3468, 'word': '4419'},\n",
      " {'freq': 1, 'id': 3472, 'word': '3900'},\n",
      " {'freq': 1, 'id': 3473, 'word': '835'},\n",
      " {'freq': 1, 'id': 3474, 'word': '528'},\n",
      " {'freq': 1, 'id': 3479, 'word': '647'},\n",
      " {'freq': 1, 'id': 3481, 'word': '709'},\n",
      " {'freq': 1, 'id': 3482, 'word': '4172'},\n",
      " {'freq': 1, 'id': 3483, 'word': '6406'},\n",
      " {'freq': 1, 'id': 3485, 'word': '7176'},\n",
      " {'freq': 1, 'id': 3487, 'word': '1383'},\n",
      " {'freq': 1, 'id': 3490, 'word': '6462'},\n",
      " {'freq': 1, 'id': 3492, 'word': '2905'},\n",
      " {'freq': 1, 'id': 3494, 'word': '963'},\n",
      " {'freq': 1, 'id': 3497, 'word': '2083'},\n",
      " {'freq': 1, 'id': 3498, 'word': '2228'},\n",
      " {'freq': 1, 'id': 3499, 'word': '692'},\n",
      " {'freq': 1, 'id': 3500, 'word': '949'},\n",
      " {'freq': 1, 'id': 3502, 'word': '4395'},\n",
      " {'freq': 1, 'id': 3504, 'word': '5772'},\n",
      " {'freq': 1, 'id': 3506, 'word': '3903'},\n",
      " {'freq': 1, 'id': 3508, 'word': '6918'},\n",
      " {'freq': 1, 'id': 3512, 'word': '1575'},\n",
      " {'freq': 1, 'id': 3515, 'word': '4850'},\n",
      " {'freq': 1, 'id': 3517, 'word': '3754'},\n",
      " {'freq': 1, 'id': 3518, 'word': '6213'},\n",
      " {'freq': 1, 'id': 3520, 'word': '1948'},\n",
      " {'freq': 1, 'id': 3521, 'word': '3075'},\n",
      " {'freq': 1, 'id': 3523, 'word': '5685'},\n",
      " {'freq': 1, 'id': 3524, 'word': '6113'},\n",
      " {'freq': 1, 'id': 3525, 'word': '6214'},\n",
      " {'freq': 1, 'id': 3527, 'word': '6563'},\n",
      " {'freq': 1, 'id': 3529, 'word': '7318'},\n",
      " {'freq': 1, 'id': 3530, 'word': '199'},\n",
      " {'freq': 1, 'id': 3531, 'word': '5503'},\n",
      " {'freq': 1, 'id': 3532, 'word': '1353'},\n",
      " {'freq': 1, 'id': 3533, 'word': '3767'},\n",
      " {'freq': 1, 'id': 3534, 'word': '7063'},\n",
      " {'freq': 1, 'id': 3535, 'word': '5337'},\n",
      " {'freq': 1, 'id': 3536, 'word': '6545'},\n",
      " {'freq': 1, 'id': 3538, 'word': '2284'},\n",
      " {'freq': 1, 'id': 3539, 'word': '2599'},\n",
      " {'freq': 1, 'id': 3540, 'word': '6231'},\n",
      " {'freq': 1, 'id': 3541, 'word': '1823'},\n",
      " {'freq': 1, 'id': 3543, 'word': '2794'},\n",
      " {'freq': 1, 'id': 3544, 'word': '2877'},\n",
      " {'freq': 1, 'id': 3545, 'word': '3655'},\n",
      " {'freq': 1, 'id': 3546, 'word': '390'},\n",
      " {'freq': 1, 'id': 3547, 'word': '5135'},\n",
      " {'freq': 1, 'id': 3548, 'word': '5855'},\n",
      " {'freq': 1, 'id': 3549, 'word': '6013'},\n",
      " {'freq': 1, 'id': 3550, 'word': '6534'},\n",
      " {'freq': 1, 'id': 3551, 'word': '6576'},\n",
      " {'freq': 1, 'id': 3552, 'word': '6659'},\n",
      " {'freq': 1, 'id': 3553, 'word': '952'},\n",
      " {'freq': 1, 'id': 3554, 'word': '2608'},\n",
      " {'freq': 1, 'id': 3555, 'word': '143'},\n",
      " {'freq': 1, 'id': 3556, 'word': '1864'},\n",
      " {'freq': 1, 'id': 3557, 'word': '1358'},\n",
      " {'freq': 1, 'id': 3558, 'word': '3306'},\n",
      " {'freq': 1, 'id': 3559, 'word': '2951'},\n",
      " {'freq': 1, 'id': 3560, 'word': '5563'},\n",
      " {'freq': 2, 'id': 211, 'word': '473'},\n",
      " {'freq': 2, 'id': 387, 'word': '7440'},\n",
      " {'freq': 2, 'id': 478, 'word': '5106'},\n",
      " {'freq': 2, 'id': 625, 'word': '5151'},\n",
      " {'freq': 2, 'id': 687, 'word': '7381'},\n",
      " {'freq': 2, 'id': 693, 'word': '7523'},\n",
      " {'freq': 2, 'id': 695, 'word': '805'},\n",
      " {'freq': 2, 'id': 794, 'word': '5019'},\n",
      " {'freq': 2, 'id': 979, 'word': '6107'},\n",
      " {'freq': 2, 'id': 997, 'word': '7191'},\n",
      " {'freq': 2, 'id': 1219, 'word': '3595'},\n",
      " {'freq': 2, 'id': 1264, 'word': '5963'},\n",
      " {'freq': 2, 'id': 1410, 'word': '5901'},\n",
      " {'freq': 2, 'id': 1424, 'word': '79'},\n",
      " {'freq': 2, 'id': 1445, 'word': '232'},\n",
      " {'freq': 2, 'id': 1533, 'word': '710'},\n",
      " {'freq': 2, 'id': 1563, 'word': '6753'},\n",
      " {'freq': 2, 'id': 1613, 'word': '2167'},\n",
      " {'freq': 2, 'id': 1685, 'word': '3443'},\n",
      " {'freq': 2, 'id': 1726, 'word': '1727'},\n",
      " {'freq': 2, 'id': 1778, 'word': '3368'},\n",
      " {'freq': 2, 'id': 1919, 'word': '2321'},\n",
      " {'freq': 2, 'id': 1923, 'word': '4177'},\n",
      " {'freq': 2, 'id': 1967, 'word': '674'},\n",
      " {'freq': 2, 'id': 2047, 'word': '7203'},\n",
      " {'freq': 2, 'id': 2058, 'word': '1028'},\n",
      " {'freq': 2, 'id': 2073, 'word': '4534'},\n",
      " {'freq': 2, 'id': 2075, 'word': '4895'},\n",
      " {'freq': 2, 'id': 2084, 'word': '6174'},\n",
      " {'freq': 2, 'id': 2126, 'word': '1109'},\n",
      " {'freq': 2, 'id': 2194, 'word': '4623'},\n",
      " {'freq': 2, 'id': 2196, 'word': '5933'},\n",
      " {'freq': 2, 'id': 2203, 'word': '2775'},\n",
      " {'freq': 2, 'id': 2221, 'word': '1532'},\n",
      " {'freq': 2, 'id': 2254, 'word': '1812'},\n",
      " {'freq': 2, 'id': 2255, 'word': '1901'},\n",
      " {'freq': 2, 'id': 2270, 'word': '5287'},\n",
      " {'freq': 2, 'id': 2293, 'word': '4723'},\n",
      " {'freq': 2, 'id': 2307, 'word': '1900'},\n",
      " {'freq': 2, 'id': 2331, 'word': '240'},\n",
      " {'freq': 2, 'id': 2352, 'word': '1966'},\n",
      " {'freq': 2, 'id': 2353, 'word': '2897'},\n",
      " {'freq': 2, 'id': 2360, 'word': '7545'},\n",
      " {'freq': 2, 'id': 2363, 'word': '3246'},\n",
      " {'freq': 2, 'id': 2376, 'word': '378'},\n",
      " {'freq': 2, 'id': 2379, 'word': '4975'},\n",
      " {'freq': 2, 'id': 2383, 'word': '5494'},\n",
      " {'freq': 2, 'id': 2409, 'word': '1840'},\n",
      " {'freq': 2, 'id': 2418, 'word': '6749'},\n",
      " {'freq': 2, 'id': 2425, 'word': '1235'},\n",
      " {'freq': 2, 'id': 2426, 'word': '4584'},\n",
      " {'freq': 2, 'id': 2438, 'word': '1478'},\n",
      " {'freq': 2, 'id': 2441, 'word': '107'},\n",
      " {'freq': 2, 'id': 2456, 'word': '4937'},\n",
      " {'freq': 2, '"
     ]
    },
    {
     "data": {
      "text/html": [
       "<b>limit_output extension: Maximum message size of 10000 exceeded with 47096 characters</b>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "if (os.path.exists(outputPath+fileName+\".dict\")):\n",
    "    dictionary = corpora.Dictionary.load(outputPath+fileName+\".dict\")\n",
    "    corpus = corpora.MmCorpus(outputPath+fileName+'.mm')\n",
    "    print(\"Used files generated from first tutorial\")\n",
    "else:\n",
    "    print(\"Please run first tutorial to generate data set\")\n",
    "\n",
    "PrintDictionary(dictionary,dict_dataframe_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "len(feature_names):1163\n"
     ]
    }
   ],
   "source": [
    "tfidfVectorizer = TfidfVectorizer(max_df=0.95,min_df=0.05)\n",
    "x = tfidfVectorizer.fit_transform(alldocument)  \n",
    "feature_names = tfidfVectorizer.get_feature_names()\n",
    "# print(\"len(feature_names):{0}\".format(len(feature_names)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [],
   "source": [
    "trainDf['text_clean'] =  trainDf['text'].apply(text_feature_ex,**{'feature_names':feature_names,'minFreq':1,'sep':\" \"})\n",
    "trainDf.columns = ['label','old_text','text']\n",
    "trainDf[['label','text']].to_csv(trainNewPath,encoding=\"utf-8\",sep=\"\\t\",index=None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [],
   "source": [
    "testDf['text_clean'] =  testDf['text'].apply(text_feature_ex,**{'feature_names':feature_names,'minFreq':1,'sep':\" \"})\n",
    "testDf.columns = ['old_text','text']\n",
    "testDf[['text']].to_csv(testNewPath,encoding=\"utf-8\",sep=\"\\t\",index=None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "hide_input": false,
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.0"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
